This notebook explores the use of SingleR
to perform cell-type annotation on datasets from the ScPCA project
SCPCP000007 (Gawad lab data).
Note: The first time you run this code it may take a few more minutes due to reference downloads, but they will be cached for faster future execution.
# load the R project
project_root <- here::here()
renv::load(project_root)
suppressPackageStartupMessages({
library(SingleCellExperiment)
library(SingleR)
library(celldex)
library(ggplot2)
})
theme_set(theme_bw())
set.seed(params$seed) # unclear if this is doing anything? probably not. but maybe later!
utils_dir <- file.path(project_root, "scripts", "utils")
source(file.path(utils_dir, "integration-helpers.R"))
sce_file_suffix <- "processed_citeseq.rds"
integrated_sce_file <- file.path(params$integrated_sce_dir, paste0(
params$project_id, "_integrated_", params$integration_method, "_sce.rds"
))
if (!file.exists(integrated_sce_file)){
stop("Integrated SCE file could not be found.")
}
Read in the data:
library_metadata_df <- readr::read_tsv(params$library_file)
Rows: 25 Columns: 16
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (15): project_name, submitter, library_biomaterial_id, sample_biomateria...
lgl (1): has_CITEseq
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
integrated_sce <- readr::read_rds(integrated_sce_file)
# Define the unintegrated SCE filenames
sce_file_paths <- library_metadata_df %>%
dplyr::filter(project_name == params$project_id) %>%
dplyr::mutate(sce_file_path = file.path(
integration_input_dir, sample_biomaterial_id, glue::glue("{library_biomaterial_id}_{sce_file_suffix}")
)) %>%
dplyr::pull(sce_file_path)
SingleR
annotationHere we performn celltype annotation with the given reference in
params$reference on each of the Gawad libraries and look at
their UMAPs colored by celltype.
if (params$reference == "hpca") {
ref_data <- celldex::HumanPrimaryCellAtlasData(ensembl = TRUE)
} else if (params$reference == "blueprint_encode") {
ref_data <- celldex::BlueprintEncodeData(ensembl = TRUE)
} else {
stop("Bad reference parameter; either 'hpca' or 'blueprint_encode'.")
}
snapshotDate(): 2022-04-26
see ?celldex and browseVignettes('celldex') for documentation
loading from cache
see ?celldex and browseVignettes('celldex') for documentation
loading from cache
snapshotDate(): 2022-04-25
loading from cache
require("ensembldb")
Warning: Unable to map 1470 of 19363 requested IDs.
annotate_SingleR <- function(sce, ref_data) {
# return updated sce and the predictions themselves
preds <- SingleR::SingleR(
test = sce,
ref = ref_data,
labels = ref_data$label.main
)
# Add `pruned.labels`, where low-confidence annotations are NA, to sce
sce$SingleR_annotations <- preds$pruned.labels
return(
list(
sce = sce,
preds = preds
)
)
}
plot_SingleR <- function(annotation_output, colname) {
# annotation_output: list of sce and preds
# colname: column name to plot
# Make a heatmap and UMAP and print out
heatmap <- SingleR::plotScoreHeatmap(annotation_output[["preds"]],
# default but let's be explicit
show.pruned = FALSE)
# In case, for `levels()` below.
colData(annotation_output[["sce"]])[,colname] <- as.factor(colData(annotation_output[["sce"]])[,colname])
umap_df <- tibble::as_tibble(reducedDim(annotation_output[["sce"]], "UMAP")) %>%
dplyr::select(UMAP1 = V1, UMAP2 = V2) %>%
dplyr::mutate(celltypes = colData(annotation_output[["sce"]])[,colname])
# We would probably like a more principled approach to colors that is
# actually based on biological information about celltypes
plot_colors <- rainbow( length(levels(umap_df$celltypes)) )
umap <- ggplot(umap_df) +
aes(x = UMAP1, y = UMAP2, color = celltypes) +
geom_point(size = 0.2, alpha = 0.5) +
scale_color_manual(values = plot_colors) +
guides(color = guide_legend(override.aes = list(size=3))) +
ggtitle(glue::glue("UMAP with top {params$max_celltypes} celltypes shown"))
print(heatmap)
print(umap)
}
# Function to run and plot SingleR
# Return SCE with annotations `SingleR_annotations` column
run_SingleR <- function(sce,
max_celltypes = params$max_celltypes,
viz = TRUE) {
# Print out the library
print(unique( metadata(sce)$library ))
# Run annotations
anno <- annotate_SingleR(sce, ref_data)
# Create an additional column `SingleR_annotations_collapsed` with only max_celltypes
anno[["sce"]]$SingleR_annotations_collapsed <- forcats::fct_lump_n(
anno[["sce"]]$SingleR_annotations,
max_celltypes
)
# plot if TRUE
if (viz) {
# there is no interesting color palette harmonization among library colors here!
plot_SingleR(anno, "SingleR_annotations_collapsed")
}
return(anno[["sce"]])
}
Here we go!
# Read in all SCE files
sce_list <- purrr::map(
sce_file_paths,
readr::read_rds
)
# Annotate them all, popping out some viz along the way
sce_list_annotated <- purrr::map(sce_list, run_SingleR, viz = TRUE)
[1] "SCPCL000295"
Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if `.name_repair` is omitted as of tibble 2.0.0.
Using compatibility `.name_repair`.
This warning is displayed once every 8 hours.
Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
[1] "SCPCL000296"
[1] "SCPCL000297"
[1] "SCPCL000298"
[1] "SCPCL000299"
# for sanity during dev:
readr::write_rds(sce_list_annotated, "sce_list_annotated.rds")
Now let’s plot, to start, the fastMNN UMAP but with celltype annotations from individual libraries. As is deeply commented all over the place, we definitely need to infuse palettes with biology!
# helper to count the number of celltypes across all libraries
count_celltypes <- function(sce) {
tibble::tibble(celltype = colData(sce)$SingleR_annotations ) %>%
dplyr::count(celltype) %>%
dplyr::arrange(-n) %>%
tidyr::drop_na()
}
# Celltype order based on **overall** counts for all pooled annotations
celltype_order <- purrr::map_df(sce_list_annotated, count_celltypes) %>%
dplyr::group_by(celltype) %>%
dplyr::summarize(celltype_counts = sum(n)) %>%
dplyr::arrange(-celltype_counts) %>%
dplyr::pull(celltype)
# Again we'd eventually like some biology to go into the color choices!
cell_colors <- rainbow(length(celltype_order))
# Find all the annotations for a combined color palette:
# This function is also used later for ADT extraction
extract_annotation <- function(sce, annotation_column_name) {
tibble::tibble(batch = unique(metadata(sce)$library),
celltype = colData(sce)[,annotation_column_name],
cell_barcode = rownames(colData(sce))) %>%
dplyr::mutate(cell_name = paste(cell_barcode, batch, sep = "-"))
}
# integrated UMAP with individual library cell annotations colored
purrr::map_df(sce_list_annotated, extract_annotation, "SingleR_annotations") %>%
dplyr::inner_join(
tibble::as_tibble(colData(integrated_sce), rownames = "cell_name")
) %>%
dplyr::bind_cols(
as.data.frame(reducedDim(integrated_sce, paste0(params$integration_method, "_UMAP")))
) %>%
dplyr::mutate(UMAP1 = V1, UMAP2 = V2) %>%
# And let's make a UMAP!
ggplot() +
aes(x = UMAP1, y = UMAP2, color = celltype) +
geom_point(size = 0.2, alpha = 0.3) +
# we definitely want some biology here!
scale_color_manual(values = cell_colors)
Joining, by = c("batch", "cell_name")
After annotating, we’d like to validate with ADTs.
find_max_adt <- function(sce) {
max_adt_indices <- apply(
logcounts(altExp(sce)),
2,
which.max
)
# associated rownames which are the ADT names
max_adt <- rownames(altExp(sce))[max_adt_indices]
# add back to sce and return
sce$max_adt <- max_adt
return(sce)
}
sce_list_adt <- purrr::map(sce_list_annotated, find_max_adt)
# What are the unique ADTs for each library?
purrr::map(
lapply(sce_list_adt, `[[`, "max_adt"),
unique
)
[[1]]
[1] "CD36" "CD45" "CD33" "CD71" "CD32" "CD45RA" "CD123"
[8] "CD3" "CD90" "CD19" "CD235ab" "CD10" "CD7"
[[2]]
[1] "CD33" "CD45" "CD36" "CD10" "CD123" "CD71" "CD366" "CD45RA"
[[3]]
[1] "CD45" "CD49f" "CD71" "CD33" "CD32" "CD123" "CD10" "CD36"
[9] "CD7" "CD274" "CD45RA" "CD90" "CD3"
[[4]]
[1] "CD123" "CD45RA" "CD71" "CD45" "CD36" "CD32" "CD19" "CD7"
[9] "CD49f" "CD3" "CD10" "CD90" "CD133" "CD25" "CD366" "CD274"
[[5]]
[1] "CD45" "CD49f" "CD71" "CD32" "CD235ab" "CD90" "CD45RA"
[8] "CD7" "CD3" "CD274" "CD123" "CD25" "CD366" "CD36"
[15] "CD10" "CD279"
CD36
CD45
CD33
CD71
CD32
CD45RA
CD123
CD3
CD90
CD19
CD235ab
CD10
CD7
CD33
CD45
CD36
CD10
CD123
CD71
CD366
CD45RA
CD45
CD49f
CD71
CD33
CD32
CD123
CD10
CD36
CD7
CD274
CD45RA
CD90
CD3
CD123
CD45RA
CD71
CD45
CD36
CD32
CD19
CD7
CD49f
CD3
CD10
CD90
CD133
CD25
CD366
CD274
CD45
CD49f
CD71
CD32
CD235ab
CD90
CD45RA
CD7
CD3
CD274
CD123
CD25
CD366
CD36
CD10
CD279
# What are the ADT distributions for each library?
purrr::map(
lapply(sce_list_adt, `[[`, "max_adt"),
table
)
[[1]]
CD10 CD123 CD19 CD235ab CD3 CD32 CD33 CD36 CD45 CD45RA
6 366 7 68 5 836 2445 292 626 15
CD7 CD71 CD90
13 19 2
[[2]]
CD10 CD123 CD33 CD36 CD366 CD45 CD45RA CD71
2 9 2845 7 1 289 17 4
[[3]]
CD10 CD123 CD274 CD3 CD32 CD33 CD36 CD45 CD45RA CD49f CD7
6 10 4 1 56 4 1 3289 2 299 2
CD71 CD90
10 3
[[4]]
CD10 CD123 CD133 CD19 CD25 CD274 CD3 CD32 CD36 CD366 CD45
1 1747 1 14 4 3 12 17 31 1 726
CD45RA CD49f CD7 CD71 CD90
1443 22 76 143 63
[[5]]
CD10 CD123 CD235ab CD25 CD274 CD279 CD3 CD32 CD36 CD366
1 18 3 4 9 4 16 5 1 1
CD45 CD45RA CD49f CD7 CD71 CD90
548 1 109 14 17 38
# What's the most common ADT distributions for each library?
most_common <- function(a_list) {
sort(table(a_list), decreasing = TRUE)[1] %>%
names()
}
purrr::map(
lapply(sce_list_adt, `[[`, "max_adt"),
most_common
)
[[1]]
[1] "CD33"
[[2]]
[1] "CD33"
[[3]]
[1] "CD45"
[[4]]
[1] "CD123"
[[5]]
[1] "CD45"
CD33
CD33
CD45
CD123
CD45
UMAP of the ADTs:
purrr::map_df(sce_list_adt, extract_annotation, "max_adt") %>%
dplyr::inner_join(
tibble::as_tibble(colData(integrated_sce), rownames = "cell_name")
) %>%
dplyr::bind_cols(
as.data.frame(reducedDim(integrated_sce, paste0(params$integration_method, "_UMAP")))
) %>%
dplyr::mutate(UMAP1 = V1, UMAP2 = V2) %>%
# And let's make a UMAP!
ggplot() +
aes(x = UMAP1, y = UMAP2, color = celltype) +
geom_point(size = 0.2, alpha = 0.3)
Joining, by = c("batch", "cell_name")
Currently the code chunks in this section are not evaluated, but remain here for now for posterity.
To build an initial sense of what we can expect annotating with
SingleR, let’s just look at one SCE (arbitrarily chosen as
the first):
# For now let's just explore one!
sce_file <- sce_file_paths[[1]]
sce <- readr::read_rds(sce_file)
sce
The celldex package contains bulk RNA-Seq datasets for
use as reference. Since this is AML data, we’ll want a reference that
has a lot of blood information. According to the celldex,
the Human Primary Cell Atlas data will be our closest
match.
The HPCA reference consists of publicly available microarray datasets derived from human primary cells (Mabbott et al. 2013). Most of the labels refer to blood subpopulations but cell types from other tissues are also available.
# define reference with ensembl IDs to match our IDs
ref_se <- celldex::HumanPrimaryCellAtlasData(ensembl = TRUE)
# Predict cell types
# If we change to `labels = ref_se$label.fine`, we'll get more
# fine-grained annotations with subtypes etc.
preds_hpca <- SingleR::SingleR(
# dataset we want to annotate
test = sce,
# reference dataset
ref = ref_se,
# label.main is broad
labels = ref_se$label.main)
# Results into a tibble:
preds_df <- tibble::as_tibble(preds_hpca$labels) %>%
dplyr::rename(celltype = value) %>%
dplyr::mutate(cell_barcode = rownames(preds_hpca),
reference = "hpca")
# Save the celltypes:
hpca_celltypes <- unique(ref_se$label.main)
And a heatmap version, showing all celltypes. The bar the top shows the final assignment, which are the rows with highest scores.
SingleR::plotScoreHeatmap(preds_hpca)
But can’t hurt to see how this compares to the
Blueprint/ENCODE reference:
The Blueprint/ENCODE reference consists of bulk RNA-seq data for pure stroma and immune cells generated by Blueprint (Martens and Stunnenberg 2013) and ENCODE projects (The ENCODE Project Consortium 2012).
# define reference with ensembl IDs to match our IDs
ref_se <- celldex::BlueprintEncodeData(ensembl = TRUE)
# Predict cell types, broadly
preds_blue_enc <- SingleR::SingleR(
# dataset we want to annotate
test = sce,
# reference dataset
ref = ref_se,
# label.main is broad
labels = ref_se$label.main)
# Results into a tibble:
preds_df <- preds_df %>%
dplyr::bind_rows(
tibble::as_tibble(preds_blue_enc$labels) %>%
dplyr::rename(celltype = value) %>%
dplyr::mutate(cell_barcode = rownames(preds_blue_enc),
reference = "blueprint_encode")
)
# Save the celltypes:
blueprint_celltypes <- unique(ref_se$label.main)
# And the heatmap:
SingleR::plotScoreHeatmap(preds_blue_enc)
What did the references predict?
# How many of each celltype?
preds_df %>%
dplyr::filter(reference == "hpca") %>%
dplyr::count(celltype) %>%
dplyr::arrange(-n)
preds_df %>%
dplyr::filter(reference == "blueprint_encode") %>%
dplyr::count(celltype) %>%
dplyr::arrange(-n)
We’ll have to harmonize the celltype names between references to do a robust comparison, but from a very quick glance, overlap is thinner than one might like. That said, we don’t necessarily expect Blueprint/ENCODE to do particularly well anyways!
For a clearer comparison, we’ll harmonize predicted celltypes, but let’s just focus on exactly matching celltypes as a start -
# Let's see what we have here:
sort(hpca_celltypes)
sort(blueprint_celltypes)
# Manually compiled data rame of celltypes roughly present in BOTH references
# for now leaving the Pre/Pro B-cells out
shared_celltypes_df <- tibble::tribble(
~hpca_celltype, ~blueprint_celltype,
# Both references have it:
"Astrocyte", "Astrocytes",
"B_cell", "B-cells",
"Chondrocytes","Chondrocytes",
"DC", "DC",
"Endothelial_cells", "Endothelial cells",
"Epithelial_cells", "Epithelial cells",
"Fibroblasts", "Fibroblasts",
"Keratinocytes", "Keratinocytes",
"Macrophage", "Macrophages",
"Monocyte", "Monocytes",
"Neurons", "Neurons",
"Neutrophils", "Neutrophils",
"NK_cell", "NK cells",
"Smooth_muscle_cells", "Smooth muscle",
# two groups - collapse back to overall T cells
"T_cells", "CD8+ T-cells",
"T_cells", "CD4+ T-cells",
# two groups of HSCs, again, collapse back to overall
"HSC_-G-CSF", "HSC",
"HSC_CD34+", "HSC"
) %>%
dplyr::mutate(harmonized_celltype = ifelse(blueprint_celltype == "HSC",
blueprint_celltype,
hpca_celltype))
harmonize_celltypes <- function(preds_df, reference_name) {
if (reference_name == "hpca") {
shared_celltypes_colname <- rlang::sym("hpca_celltype")
} else {
shared_celltypes_colname <- rlang::sym("blueprint_celltype")
}
filtered_preds_df <- preds_df %>%
dplyr::filter(reference == reference_name)
sym_reference_name <- rlang::sym(reference_name)
filtered_preds_df %>%
dplyr::inner_join(
dplyr::select(
shared_celltypes_df,
celltype = {{shared_celltypes_colname}},
harmonized_celltype
)
) %>%
dplyr::select(cell_barcode, {{sym_reference_name}} := harmonized_celltype) %>%
dplyr::right_join(filtered_preds_df) %>%
dplyr::mutate({{reference_name}} := dplyr::if_else(
{{sym_reference_name}} %in% shared_celltypes_df$harmonized_celltype,
{{sym_reference_name}},
celltype
)) %>%
dplyr::select(-celltype, -reference)
}
preds_df_harmonized <- harmonize_celltypes(preds_df, "blueprint_encode") %>%
dplyr::left_join(
harmonize_celltypes(preds_df, "hpca")
)
# How often do they match?
preds_df_harmonized %>%
dplyr::summarize(percent_same = sum(blueprint_encode == hpca) / dplyr::n() )
# Where do they not match?
preds_df_harmonized %>%
dplyr::filter(hpca != blueprint_encode)
# Are there specific combinations that get differently annotated?
preds_df_harmonized %>%
dplyr::filter(hpca != blueprint_encode) %>%
dplyr::select(-cell_barcode) %>%
dplyr::count(blueprint_encode, hpca) %>%
dplyr::arrange(-n)
sessionInfo()
R version 4.2.1 (2022-06-23)
Platform: x86_64-apple-darwin17.0 (64-bit)
Running under: macOS Big Sur ... 10.16
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] stats4 stats graphics grDevices datasets utils methods
[8] base
other attached packages:
[1] ensembldb_2.20.2 AnnotationFilter_1.20.0
[3] GenomicFeatures_1.48.3 AnnotationDbi_1.58.0
[5] magrittr_2.0.3 ggplot2_3.3.6
[7] celldex_1.6.0 SingleR_1.10.0
[9] SingleCellExperiment_1.18.0 SummarizedExperiment_1.26.1
[11] Biobase_2.56.0 GenomicRanges_1.48.0
[13] GenomeInfoDb_1.32.3 IRanges_2.30.0
[15] S4Vectors_0.34.0 BiocGenerics_0.42.0
[17] MatrixGenerics_1.8.1 matrixStats_0.62.0
loaded via a namespace (and not attached):
[1] AnnotationHub_3.4.0 BiocFileCache_2.4.0
[3] lazyeval_0.2.2 splines_4.2.1
[5] BiocParallel_1.30.3 digest_0.6.29
[7] htmltools_0.5.3 viridis_0.6.2
[9] fansi_1.0.3 memoise_2.0.1
[11] ScaledMatrix_1.4.0 tzdb_0.3.0
[13] Biostrings_2.64.0 miQC_1.4.0
[15] readr_2.1.2 vroom_1.5.7
[17] prettyunits_1.1.1 colorspace_2.0-3
[19] blob_1.2.3 rappdirs_0.3.3
[21] xfun_0.32 dplyr_1.0.9
[23] crayon_1.5.1 RCurl_1.98-1.8
[25] jsonlite_1.8.0 glue_1.6.2
[27] gtable_0.3.0 zlibbioc_1.42.0
[29] XVector_0.36.0 DelayedArray_0.22.0
[31] BiocSingular_1.12.0 scales_1.2.0
[33] pheatmap_1.0.12 DBI_1.1.3
[35] Rcpp_1.0.9 viridisLite_0.4.0
[37] xtable_1.8-4 progress_1.2.2
[39] bit_4.0.4 rsvd_1.0.5
[41] httr_1.4.3 RColorBrewer_1.1-3
[43] modeltools_0.2-23 ellipsis_0.3.2
[45] pkgconfig_2.0.3 XML_3.99-0.10
[47] flexmix_2.3-18 farver_2.1.1
[49] nnet_7.3-17 sass_0.4.2
[51] dbplyr_2.2.1 utf8_1.2.2
[53] here_1.0.1 tidyselect_1.1.2
[55] labeling_0.4.2 rlang_1.0.4
[57] later_1.3.0 munsell_0.5.0
[59] BiocVersion_3.15.2 tools_4.2.1
[61] cachem_1.0.6 cli_3.3.0
[63] generics_0.1.3 RSQLite_2.2.15
[65] ExperimentHub_2.4.0 evaluate_0.16
[67] stringr_1.4.0 fastmap_1.1.0
[69] yaml_2.3.5 knitr_1.39
[71] bit64_4.0.5 purrr_0.3.4
[73] KEGGREST_1.36.3 sparseMatrixStats_1.8.0
[75] mime_0.12 xml2_1.3.3
[77] biomaRt_2.52.0 compiler_4.2.1
[79] filelock_1.0.2 curl_4.3.2
[81] png_0.1-7 interactiveDisplayBase_1.34.0
[83] tibble_3.1.8 bslib_0.4.0
[85] stringi_1.7.8 highr_0.9
[87] forcats_0.5.2 lattice_0.20-45
[89] ProtGenerics_1.28.0 Matrix_1.4-1
[91] vctrs_0.4.1 pillar_1.8.0
[93] lifecycle_1.0.1 BiocManager_1.30.18
[95] jquerylib_0.1.4 BiocNeighbors_1.14.0
[97] bitops_1.0-7 irlba_2.3.5
[99] httpuv_1.6.5 rtracklayer_1.56.1
[101] R6_2.5.1 BiocIO_1.6.0
[103] promises_1.2.0.1 renv_0.15.5
[105] gridExtra_2.3 codetools_0.2-18
[107] assertthat_0.2.1 rprojroot_2.0.3
[109] rjson_0.2.21 withr_2.5.0
[111] GenomicAlignments_1.32.1 Rsamtools_2.12.0
[113] GenomeInfoDbData_1.2.8 parallel_4.2.1
[115] hms_1.1.1 grid_4.2.1
[117] beachmat_2.12.0 tidyr_1.2.0
[119] rmarkdown_2.14 DelayedMatrixStats_1.18.0
[121] shiny_1.7.2 restfulr_0.0.15